/*==============================================================================
FILE NAME: Create_Air_Panel_Onsite_Included.do
INPUTS: Investigations_Clean.dta, IN_onsite.dta, NSR_Permits_Clean.dta, TitleV_Permits_Clean.dta, facility_characteristics.dta, Notice_of_Violation_Clean.dta, IN_vio_cat.dta, Enforcements_Clean.dta
	From Create_Air_Panel.do: inv_temp.dta, Panel_scaffold.dta, 
OUTPUTS: 
CREATED: 16 June 2022
UPDATED: 31 July 2025
==============================================================================*/
clear all

/* Set directory if working independently through code
if c(username)=="" { //insert username
	global rootdir "" // insert root path
	global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 
*/
set more off

//First, identify RN_ids associated with IN with missing values of onsite. Drop these RN_ids later in the code.
use "$processed_data/Investigations_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
destring IN, replace
keep if Media=="AIR"
isid IN
keep IN RN_id year
sort IN
merge 1:1 IN using "$processed_data/IN_onsite.dta"
drop _merge
tab code1 if onsite==.
foreach v of varlist code2-code8{
	tab `v' if onsite==. & code1=="UML3"
}
replace onsite=0 if code1=="UML3"
keep if onsite==.
keep RN_id year
duplicates drop
gen drop_RN=1
sort RN_id year
save "$processed_data/temp_drop_RNs.dta", replace



//Using investigations dataset and unique observation information (IN), build a panel dataset of air investigations with compaint versus noncomplaint and onsite versus offsite information
use "$processed_data/investigations_with_noe_nov.dta", clear //dataset from Explore_Investigations.do
drop year
sort IN
merge 1:m IN using "$processed_data/inv_temp.dta" //Using a temp dataset from Create_Air_Panel.do
keep if _merge==3
drop _merge
gen mdate=ym(year,month)
//focus on air investigations
keep if Media=="AIR"
//all but 9 investigations with missing values for Media are before 2003
rename investigation air_investigation
keep IN complaint_inv air_investigation RN_id mdate day
duplicates drop
//merge in onsite information
sort IN
merge 1:1 IN using "$processed_data/IN_onsite.dta"
drop _merge
foreach v of varlist code2-code8{
	tab `v' if onsite==. & code1=="UML3"
}
replace onsite=0 if code1=="UML3"
isid IN
//at this point IN uniquely identifies obs
gen offsite=0
replace offsite=1 if onsite==0
gen noncomp_onsite=.
replace noncomp_onsite=1 if complaint_inv==0 & onsite==1
replace noncomp_onsite=0 if complaint_inv==0 & onsite==0
label var noncomp_onsite "=1 for non-complaint on-site inv; =0 for non-complaint off-site inv"
gen comp_onsite=.
replace comp_onsite=1 if complaint_inv==1 & onsite==1
replace comp_onsite=0 if complaint_inv==1 & onsite==0
label var comp_onsite "=1 for complaint on-site inv; =0 for complaint off-site inv"
tab noncomp_onsite, missing
tab comp_onsite, missing
//no variation in complaint investigations
gen noncomp_offsite=.
replace noncomp_offsite=0 if noncomp_onsite==1
replace noncomp_offsite=1 if noncomp_onsite==0
gen comp_offsite=.
replace comp_offsite=1 if complaint_inv==1 & onsite==0
replace comp_offsite=0 if complaint_inv==1 & onsite==1
label var comp_offsite "=1 for complaint off-site inv; =0 for complaint on-site inv"

sort RN_id mdate day
//next step will give us # by RN-day
collapse (sum) noncomp_onsite noncomp_offsite comp_onsite comp_offsite air_investigation, by(RN_id mdate day)
//now create days investigated
sort RN_id mdate
gen day_air_noncomp_onsite=0
replace day_air_noncomp_onsite=1 if noncomp_onsite>0
gen day_air_noncomp_offsite=0
replace day_air_noncomp_offsite=1 if noncomp_offsite>0
gen day_air_inv=0
replace day_air_inv=1 if air_investigation>0
gen day_air_comp_onsite=0
replace day_air_comp_onsite=1 if comp_onsite>0
gen day_air_comp_offsite=0
replace day_air_comp_offsite=1 if comp_offsite>0
keep RN_id mdate day_air_noncomp_onsite day_air_noncomp_offsite day_air_inv day_air_comp_onsite day_air_comp_offsite
sort RN_id mdate 
//now by month
collapse (sum) day_air_noncomp_onsite day_air_noncomp_offsite day_air_inv day_air_comp_onsite day_air_comp_offsite, by(RN_id mdate)
label var day_air_noncomp_onsite  "# days RN had >=1 onsite noncomplaint air investigation in month"
label var day_air_noncomp_offsite "# days RN had >=1 offsite noncomplaint air investigation in month"
label var day_air_inv "# day RN had >=1 air investigation in month"
label var day_air_comp_onsite  "# days RN had >=1 onsite complaint air investigation in month"
label var day_air_comp_offsite "# days RN had >=1 offsite complaint air investigation in month"
sort RN_id mdate
//restrict to 2003 to 2019
format mdate %tm
keep if mdate>=tm(2003m1) & mdate<=tm(2019m12)
merge 1:1 RN_id mdate using "$processed_data/Panel_scaffold.dta" //where Panel_scaffold is created in Create_Air_Panel.do
drop if _merge==1
drop _merge
foreach x of varlist day_air_noncomp_onsite day_air_noncomp_offsite day_air_inv day_air_comp_onsite day_air_comp_offsite{
replace `x'=0 if `x'==.
}
sort RN_id mdate
gen p_air_noncomp_onsite=(day_air_noncomp_onsite>0)
label var p_air_noncomp_onsite "=1 if RN had at least one onsite noncomplaint air investigation in month"
gen p_air_noncomp_offsite=(day_air_noncomp_offsite>0)
label var p_air_noncomp_offsite "=1 if RN had at least one offsite noncomplaint air investigation in month"
gen p_air_comp_onsite=(day_air_comp_onsite>0)
label var p_air_comp_onsite "=1 if RN had at least one onsite complaint air investigation in month"
gen p_air_comp_offsite=(day_air_comp_offsite>0)
label var p_air_comp_offsite "=1 if RN had at least one offsite complaint air investigation in month"
save "$processed_data/Panel_inv_types_onsite_included.dta", replace


//Include information about violation severity for investigations and add counts and binary indicators for cat A, B, and C violations to Panel_inv_onsite_included.dta.
use "$processed_data/IN_vio_cat.dta", clear //Datset produced in Explore violation severity.do
drop year
sort IN
merge 1:m IN using "$processed_data/inv_temp.dta"
keep if _merge==3
drop _merge
gen mdate=ym(year,month)
//focus on air investigations
keep if Media=="AIR"
//less than 1% of investigations after 2003 have missing values for Media
keep IN complaint_inv RN_id mdate catA catB catC any_cat
duplicates drop
//at this point IN uniquely identifies obs
sort RN_id mdate
//next step will give us # of cat A, cat B, and cat C violations by RN-mdate
collapse (sum) catA catB catC any_cat, by(RN_id mdate)
rename catA air_catA
rename catB air_catB
rename catC air_catC
rename any_cat air_any_cat

label var air_catA "# of category A violations for RN in the month"
label var air_catB "# of category B violations for RN in the month"
label var air_catC "# of category C violations for RN in the month"
label var air_any_cat "# of violations for RN in the month"
gen p_air_catA=(air_catA>0)
label var p_air_catA "=1 if RN had at least one category A violation in the month"
gen p_air_catB=(air_catB>0)
label var p_air_catB "=1 if RN had at least one category B violation in the month"
gen p_air_catC=(air_catC>0)
label var p_air_catC "=1 if RN had at least one category C violation in the month"
gen p_air_any_cat=(air_any_cat>0)
label var p_air_any_cat "=1 if RN had at least one violation in the month"
//restrict to 2003 to 2019
format mdate %tm
keep if mdate>=tm(2003m1) & mdate<=tm(2019m12)
merge 1:1 RN_id mdate using "$processed_data/Panel_inv_types_onsite_included.dta"
drop if _merge==1
drop _merge
foreach x of varlist air_catA-p_air_any_cat{
replace `x'=0 if `x'==.
}
save "$processed_data/Panel_inv_types_onsite_included.dta", replace


//Merge in NOVs
use "$processed_data/Notice_of_Violation_Clean.dta", clear 
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
//focus on air NOVs
keep if violation_air==1
//all but 4 observations that have missing violation_air occur before 2003
keep RN_id VN day month year
gen mdate=ym(year,month)
format mdate %tm
duplicates drop
//the same RN VN appears on different dates
//create a count of unique VNs for each RN-month
keep RN_id VN mdate
duplicates drop
sort RN_id mdate
collapse (count) VN, by(RN_id mdate)
rename VN air_novs
label var air_novs "number of air NOVs received by RN in month"
sort RN_id mdate
merge 1:1 RN_id mdate using "$processed_data/Panel_inv_types_onsite_included.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace air_novs=0 if air_novs==.
sort RN_id mdate
gen p_air_nov=(air_novs>0)
label var p_air_nov "=1 if RN had at least one NOV in month"
save "$processed_data/Panel_inv_types_onsite_included_nov.dta", replace


//Merge in NOEs
use "$processed_data/Enforcements_Clean.dta", clear
gen RN_id=substr(RN,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
keep RN_id VN day month year enforcement_air
gen mdate=ym(year,month)
format mdate %tm
//focus on air NOEs
keep if enforcement_air==1
keep RN_id VN mdate
duplicates drop
sort RN_id mdate
collapse (count) VN, by(RN_id mdate)
rename VN air_noes
label var air_noes "number of air NOEs received by RN in month"
sort RN_id mdate
merge 1:1 RN_id mdate using "$processed_data/Panel_inv_types_onsite_included_nov.dta"
unique RN_id if _merge==3
drop if _merge==1
drop _merge
replace air_noes=0 if air_noes==.
sort RN_id mdate
gen p_air_noe=(air_noes>0)
label var p_air_noe "=1 if RN had at least one air NOE in month"
save "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta", replace


//Create a variable that identifies RNs never investigated bt 2003 & 2019
use "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta", clear
keep RN_id day_air_inv
sort RN_id
collapse (sum) day_air_inv, by(RN_id)
gen never_air_inv=0
replace never_air_inv=1 if day_air_inv==0
label var never_air_inv "=1 if RN never has air investigation bt 2003 & 2019"
sort RN_id
save "$processed_data/temp.dta", replace
use "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta"
merge m:1 RN_id using "$processed_data/temp.dta"
drop _merge
order NSR, first
order never_air_inv, after(NSR)
drop day_air_inv day_air_noncomp_offsite day_air_noncomp_onsite
save "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta", replace


//Create a variable that identifies RNs that ever had a complaint bt 2003 & 2019
use "$processed_data/incidents.dta", clear
keep if Media=="AIR"
gen RN_id=substr(RegulatedEntity,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
rename ComplaintIncident CIN
gen temp = date(IncidentRecDate,"MDY")
drop IncidentRecDate 
rename temp IncidentRecDate
format IncidentRecDate %td
replace IncidentRecDate = . if IncidentRecDate < 0
gen year=year(IncidentRecDate)
gen month=month(IncidentRecDate)
gen mdate=ym(year,month)
format mdate %tm
//restrict to 2003 to 2019
keep if mdate>=tm(2003m1) & mdate<=tm(2019m12)
//if RN_id shows up at this point, then facility was subject of complaint during sample period
keep RN_id
duplicates drop
gen ever_complaint=1
label var ever_complaint "=1 if RN_id had at least one air complaint b/t 2003 & 2019"
sort RN_id
merge 1:m RN_id using "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta"
drop if _merge==1
replace ever_complaint=0 if _merge==2
drop _merge 
sort RN_id mdate
save "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta", replace


//Remove RN_id-years in which facility had investigation for which we don't observe onsite status
use "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta", clear
sort RN_id year
//if merge successful, drop RN_id-year
merge m:1 RN_id year using "$processed_data/temp_drop_RNs.dta"
keep if _merge==1
drop _merge 
sort RN_id mdate
save "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta", replace


erase "$processed_data/Panel_inv_types_onsite_included_nov.dta"
erase "$processed_data/Panel_inv_types_onsite_included.dta"
erase "$processed_data/temp.dta"


use "$processed_data/Panel_inv_types_onsite_included_nov_noe.dta", clear
drop if never_air_inv==1
egen t = group(year month)
xtset RN_id t


foreach y in p_air_nov p_air_noe p_air_catA p_air_catB p_air_catC{
forv h = 0/12 {
gen `y'_`h' = f`h'.`y' - l1.`y'
}

forv h = 2/12 {
gen `y'_neg`h' = l`h'.`y' - l1.`y'
}
}

egen RN_year = group(RN year)
save "$processed_data/Panel_inv_types_onsite_included_final.dta", replace //where investigation types include onsite/offsite investigations and complaint/non-complaint investigations
